00_all

Master Document - Breast Cancer Response Analysis

Load Data

Load libraries

library("GEOquery")
library("readr")
library("here")
library("tidyverse")

Save raw data

gse <- getGEO("GSE20194",
              GSEMatrix = TRUE,
              getGPL = FALSE)
Found 1 file(s)
GSE20194_series_matrix.txt.gz
save(gse,
     file = here("data/_raw/raw_data.RData"))

Load raw data

load(here("data/_raw/raw_data.RData"))

eset <- gse[[1]]

Preliminary processing

Convert the ExpressionSet object to a data frame and handle missing or blank values to prevent misalignment.

# Extract sample metadata and add a sample column
# Using %>% instead of |> because %>% allows using '.' as placeholder
dat_load <- as.data.frame(pData(eset))  %>% 
  mutate(
    sample_id = rownames(.)
  )  %>% 
  relocate(sample_id)  %>% 
  mutate(
    across(where(is.factor), as.character)
  )  %>%   
  mutate(
    across(everything(),
           ~ ifelse(. == "" | is.na(.), NA_character_, .))
  )

Save load data

write_tsv(
  dat_load, 
  here("data", "01_dat_load.tsv"), 
  na = "NA")

Clean Data

Load libraries

library("tidyverse")
library("here")

Load data

input_file <- here("data/01_dat_load.tsv")
dat_load <- read_tsv(
  input_file,
  na = c("",
         "NA"),
  show_col_types = FALSE)

Clean data

# Select columns with core characteristics
dat_core <- dat_load |> 
  select(
    geo_accession,
    matches("^characteristics_ch1\\.([1-9]|1[0-4])$"))
dat_clean <- dat_core |>
  
  # Pivot all characteristics_ch1.* columns longer
  pivot_longer(
    cols = starts_with("characteristics_ch1."),
    names_to = "char_num",
    values_to = "char_value") |> 
  
  # Separate colname and value by ": "
  separate(
    char_value, 
    into = c("colname",
             "value"),
    sep = ":\\s+") |> 
  select(
    -("char_num")) |> 
  
  # Handle duplicate entries by keeping the first value
  pivot_wider(
    names_from = colname,
    values_from = value,
    values_fn = function(x) x[1]) |> 
  select(
    -`treatments comments`,
    -`NA`) |>
  
  # Replace 'ND' with NA
  mutate(
    across(everything(),
           ~na_if(.,
                  "ND")))
# Unify variants in histology
dat_clean <- dat_clean |> 
   mutate(
     histology = case_when(
       histology %in% c("ILC/IDC", "IDC+ILC") ~ "IDC/ILC",
       histology %in% c("IDC+DCIS", "IDC/DCIS", "IDC + DCIS") ~ "IDC/DCIS",
       histology %in% c("IDC/anaplastic", "IDC, medullary features") ~ "IDC",
    TRUE ~ histology
  ))

Rename columns

dat_clean <- dat_clean |>
  rename(
    sample_id    = geo_accession,
    response     = pcr_vs_rd,
    tumor_size_before = tbefore,
    nodes_before = nbefore,
    grade        = bmngrd,
    er_percent   = er,
    her2_status  = `her2 status`,
    her2_ihc     = `her2 ihc`,
    her2_fish    = `her2 fish`,
    treatment    = `treatment code`)

Convert data types

dat_clean <- dat_clean |>
  mutate(
    # factor varaibles
    er_status = factor(er_status,
                       levels = c("N",
                                  "P")),
    pr_status = factor(pr_status,
                       levels = c("N",
                                  "P")),
    response = factor(response,
                      levels = c("RD",
                                 "pCR")),
    race = factor(race),
    her2_status = factor(her2_status,
                         levels = c("N",
                                    "P")),
    her2_ihc = factor(her2_ihc,
                      levels = c("0",
                                 "1",
                                 "2",
                                 "3")),
    histology = factor(histology),
    treatment = factor(treatment),
    
    # numeric variables
    age               = as.numeric(age),
    tumor_size_before = as.numeric(tumor_size_before),
    nodes_before      = as.numeric(nodes_before),
    grade             = as.numeric(grade),
    er_percent        = as.numeric(er_percent),
    her2_fish         = as.numeric(her2_fish))

Save clean data

write_tsv(
  dat_clean, 
  here("data",
       "02_dat_clean.tsv"), 
  na = "NA")

Augment Data

Load libraries

library("tidyverse")
library("here")

Load data

input_file <- here("data/02_dat_clean.tsv")
dat_clean <- read_tsv(
  input_file,
  na = c("",
         "NA"),
  show_col_types = FALSE)

Augment data

dat_aug <- dat_clean |> 
  mutate(
    er_num = if_else(
      er_status == "P",
      1,
      0),
    pr_num = if_else(
      pr_status == "P",
      1,
      0),
    her2_num = if_else(
      her2_status == "P",
      1,
      0),
    
    # hormone receptor groups
    hormone_group = paste0(
      if_else(er_num == 1,
              "ER+",
              "ER-"),
      if_else(pr_num == 1,
              "PR+",
              "PR-")),
    
    # three receptor combination
    receptor_group = paste0(
      if_else(er_num == 1,
              "ER+",
              "ER-"),
      if_else(pr_num == 1,
              "PR+",
              "PR-"),
      if_else(her2_num == 1,
              "HER2+",
              "HER2-")),
    
    # triple negative
    triple_negative = case_when(
      er_num == 0 & pr_num == 0 & her2_num == 0 ~ 1,  # TNBC
      TRUE ~ 0))

Save augment data

write_tsv(
  dat_aug, 
  here("data",
       "03_dat_aug.tsv"), 
  na = "NA")

Describe Data

Load Libraries

library("tidyverse")
library("here")
library("RColorBrewer")
source(here("R/99_proj_func.R"))

Load Data

Load the data as a variable to use in descriptive graphs

augment_data <- 
  read_tsv(file = here("data/03_dat_aug.tsv"))

Descriptive Graphs

custom_palette <- c(
  "#7f3667",
  # "#a53e76",
  "#bb437e",
  # "#d24787",
  "#e44b8d",
  "#e2619f",
  # "#e27bb1",
  "#e5a0c6",
  "lightblue",
  "#B3D84C",  
  # "#A6D15A",  
  "#90C84C",  
  "#70B83F",  
  "#4F9A2F",  
  # "#2F7B1F",  
  "#1D6510",  
  "#004400")

Distribution of age groups and race

df_plot1 <- augment_data |>
  mutate(
    age_group = cut(
      age,
      breaks = seq(0,
                   100,
                   by = 10),
      labels = paste(seq(0, 90, 10),
                     seq(9, 99, 10),
                     sep = "-"))) |>
  filter(
    !is.na(age_group)) |>
  count(age_group,
        race)

plot1_pallete <- c("#e44b8d",
                   "#e5a0c6",
                   "lightblue",
                   "#B3D84C",
                   "#70B83F")

gg_4_1 <- ggplot(
  df_plot1,
  aes(
    x = age_group,
    y = n,
    fill = race)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of Age Groups and Race",
    x = "Age groups",
    y = "Number of subjects",
    fill = "Races") +
  scale_fill_manual(values = plot1_pallete) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

gg_4_1

Distribution of hormones in different patients

df_plot2 <- augment_data |>
  mutate(
    receptor_group = case_when(
      er_status == "N" & pr_status == "N" & her2_status == "N" ~ "ER-PR-HER2-",
      er_status == "P" & pr_status == "N" & her2_status == "N" ~ "ER+PR-HER2-",
      er_status == "N" & pr_status == "P" & her2_status == "N" ~ "ER-PR+HER2-",
      er_status == "N" & pr_status == "N" & her2_status == "P" ~ "ER-PR-HER2+",
      er_status == "P" & pr_status == "P" & her2_status == "N" ~ "ER+PR+HER2-",
      er_status == "P" & pr_status == "N" & her2_status == "P" ~ "ER+PR-HER2+",
      er_status == "N" & pr_status == "P" & her2_status == "P" ~ "ER-PR-HER2-",
      er_status == "P" & pr_status == "P" & her2_status == "P" ~ "ER+PR+HER2+")) |>
  count(receptor_group)

plot2_palette <- c("#7f3667",
                   "#e44b8d",
                   "#e5a0c6",
                   "lightblue",
                   "#B3D84C", 
                   "#70B83F",
                   "#2F7B1F")

gg_4_2 <- ggplot(
  df_plot2,
  aes(
    x = receptor_group,
    y = n,
    fill = receptor_group)) +
  geom_col() +
  labs(
    title = "Distribution of Hormones",
    x = "Receptor groups",
    y = "Number of occurances",
    fill = "Receptor groups") +
  scale_fill_manual(values = plot2_palette) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 30,
                                   hjust = 0.9,
                                   vjust = 1.2),
        plot.title = element_text(hjust = 0.5))

gg_4_2

Distribution of successful or unsuccessful treatment

df_plot3 <- augment_data |>
  filter(
    !is.na(response)) |>
  count(response)

plot3_palette <- c("#e44b8d",
                   "#90C84C")

gg_4_3 <- ggplot(
  df_plot3,
  aes(
    x = response, 
    y = n,
    fill = response)) +
  geom_col() +
  labs(
    title = "Distribution of Treatment Success: 
             pCR = 'no detectable tumor after treatment'
             RD = 'tumor remains after therapy'",
    x = "Response type", 
    y = "Number of cases",
    fill = "Response type") +
  scale_fill_manual(values = plot3_palette) +
  theme_minimal()

gg_4_3

Distribution of tumor subtypes (histology)

df_plot4 <- augment_data |>
  filter(
    !is.na(histology)) |>
  count(histology)

gg_4_4 <- ggplot(
  df_plot4,
  aes(
    x = histology, 
    y = n,
    fill = histology)) +
  geom_col() +
  labs(
    title = "Distribution of Tumor Subtypes (Histology)",
    x = "Histology",
    y = "Number of cases",
    fill = "Histology") +
  scale_fill_manual(values = custom_palette) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   vjust = 1),
        plot.title = element_text(hjust = 0.5))

gg_4_4

Distribution of tumor aggressiveness (bmngrd = Bloom–Richardson (BR) grade)

df_plot5 <- augment_data |>
  mutate(grade = factor(grade)) |>
  count(grade)

plot5_palette <- c("#e44b8d",
                   "lightblue",
                   "#90C84C")

gg_4_5 <- ggplot(
  df_plot5,
  aes(
    x = grade, 
    y = n,
    fill = grade)) +
  geom_col() +
  labs(
    title = "Distribution of Tumor Aggressiveness (BR grade)",
    x = "Grade",
    y = "Number of cases", 
    fill = "Grade") +
  scale_fill_manual(values = plot5_palette, na.value = "gray") +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))

gg_4_5

Save plots as png

save_plot(gg_4_1, "04_key_plot_1.png")
save_plot(gg_4_2, "04_plot_2.png")
save_plot(gg_4_3, "04_plot_3.png")
save_plot(gg_4_4, "04_plot_4.png")
save_plot(gg_4_5, "04_plot_5.png")

Analysis

Loading Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>
  mutate(
    # creating a new column for the combinations of E and P receptor statuses
    er_pr_status = case_when(
    er_status == "P" & pr_status == "P" ~ "ER Positive, PR Positive",
    er_status == "P" & pr_status == "N" ~ "ER Positive, PR Negative",
    er_status == "N" & pr_status == "P" ~ "ER Negative, PR Positive",
    er_status == "N" & pr_status == "N" ~ "ER Negative, PR Negative"),
    
    # making grade into a factor -> needed for coloring in graphs
    grade = factor(grade, levels = c(1, 2, 3), exclude = NULL))

Plotting

ER & PR Status vs Aggressiveness

receptors_vs_grade <- ggplot(
  dataset,
  aes(
    x = er_pr_status,
    fill = grade)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("1" = "pink", 
               "2" = "red", 
               "3" = "darkred", 
               "NA" = "gray")) +
  labs(
    title = "ER & PR Status vs Aggressiveness",
    x = "Receptor Status Combination", 
    y = "Amount", 
    fill = "Grade") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

receptors_vs_grade

Save plot

save_plot(receptors_vs_grade, 
          "05_plot_1.png")

Loading Libraries

library("here") 
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"  
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>
  mutate(
    # making grade into a factor -> needed for coloring in graphs
    grade = factor(grade,
                   levels = c(1, 2, 3),
                   exclude = NULL))

Plotting

HER2 vs Aggressiveness

her2_vs_grade <- ggplot(
  dataset, 
  aes(
    x = her2_status,
    fill = grade)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("1" = "pink", 
               "2" = "red", 
               "3" = "darkred", 
               "NA" = "gray")) +
  labs(
    title = "HER2 Status vs Aggressiveness",
    x = "HER2 Status", 
    y = "Amount", 
    fill = "Grade") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

her2_vs_grade

Save plot

save_plot(her2_vs_grade,
          "06_plot_2.png")

Loading Libraries

library("here") 
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"  
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>
  mutate(
    # making tumor_size_before into a factor -> needed for coloring in graphs
    tumor_size_before = factor(
      tumor_size_before,
      levels = c(0, 1, 2, 3, 4),
      exclude = NULL))

Plotting

HER2 vs Tumor Size

her2_vs_tmrsz <- ggplot(
  dataset,
  aes(
    x = her2_status,
    fill = tumor_size_before)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("0" = "#ADD8E6",
               "1" = "#87CEFA", 
               "2" = "#4682B4", 
               "3" = "#0000CD",
               "4" = "#00008B",
               "NA" = "gray")) +
  labs(
    title = "HER2 Status vs Tumor Size Before Treatment",
    x = "HER2 Status", 
    y = "Amount", 
    fill = "Tumor Size") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

her2_vs_tmrsz

Save plot

save_plot(her2_vs_tmrsz,
          "07_plot_3.png")

Loading Libraries

library("here")  
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"    
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>   
  mutate(     
    # making nodes_before into a factor -> needed for coloring in graphs
    nodes_before = factor(
      nodes_before,
      levels = c(0, 1, 2, 3), 
      exclude = NULL))

Plotting

HER2 vs Nodes

her2_vs_nodes <- ggplot(
  dataset, 
  aes(
    x = her2_status, 
    fill = nodes_before)) +
  geom_bar(stat = "count", 
           position = "stack") +
  scale_fill_manual(
    values = c("0" = "#90EE90",
               "1" = "#32CD32", 
               "2" = "#228B22", 
               "3" = "#006400",
               "NA" = "gray")) +
  labs(
    title = "HER2 Status vs Node Spread Before Treatment",
    x = "HER2 Status", 
    y = "Amount", 
    fill = "Node Spread") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

her2_vs_nodes

Save plot

save_plot(her2_vs_nodes, 
          "08_plot_4.png")

Loading Libraries

library("here")   
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"   
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>
  mutate(
    # creating a new column for the combinations of E and P receptor statuses
    er_pr_status = case_when(
      er_status == "P" & pr_status == "P" ~ "ER Positive, PR Positive",
      er_status == "P" & pr_status == "N" ~ "ER Positive, PR Negative",
      er_status == "N" & pr_status == "P" ~ "ER Negative, PR Positive",
      er_status == "N" & pr_status == "N" ~ "ER Negative, PR Negative"),
    
    # making response into factor -> needed for coloring in graphs
    response = factor(
      response,
      levels = c("RD", "pCR"),
      exclude = NULL))

Plotting

ER & PR Status vs Treatment Outcome

receptors_vs_outcome <- ggplot(
  dataset, 
  aes(
    x = er_pr_status,
    fill = response)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("RD" = "darkred", 
               "pCR" = "#32CD32")) +
  labs(
    title = "ER & PR Status vs Treatment Outcome",
    x = "Receptor Status Combination", 
    y = "Amount", 
    fill = "Response") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

receptors_vs_outcome

Save plot

save_plot(receptors_vs_outcome,
          "09_plot_5.png")

Loading Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"       
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>   
  mutate(     
    # making response into factor -> needed for coloring in graphs     
    response = factor(
      response,
      levels = c("RD", "pCR"),
      exclude = NULL))

Plotting

HER2 Status vs Treatment Outcome

her2_vs_outcome <- ggplot(
  dataset, 
  aes(
    x = her2_status,
    fill = response)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("RD" = "darkred", 
               "pCR" = "#32CD32")) +
  labs(
    title = "HER2 Status vs Treatment Outcome",
    x = "HER2 Status", 
    y = "Amount", 
    fill = "Response") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

her2_vs_outcome

Save plot

save_plot(her2_vs_outcome,
          "10_plot_6.png")

Loading Libraries

library("here") 
library("tidyverse")
source(here("R/99_proj_func.R"))

Reading the Data

data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))

Data Wrangling

dataset <- dataset |>      
  mutate(          
    # making grade and response into factors 
    # -> for coloring graphs and separating NAs in grade
    grade = factor(
      grade,
      levels = c(1, 2, 3),
      exclude = NULL),
    response = factor(
      response,
      levels = c("RD", "pCR"), 
      exclude = NULL))

Plotting

Aggressiveness vs Treatment Outcome

grade_vs_outcome <- ggplot(
  dataset, 
  aes(
    x = grade, 
    fill = response)) +
  geom_bar(stat = "count",
           position = "stack") +
  scale_fill_manual(
    values = c("RD" = "darkred", 
               "pCR" = "#32CD32")) +
  labs(
    title = "Aggressiveness vs Treatment Outcome",
    x = "Grade", 
    y = "Amount", 
    fill = "Response") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))

grade_vs_outcome

Save plot

save_plot(grade_vs_outcome, 
          "11_plot_7.png")

Load libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load data as variables

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

Graphical Comparison

Outcome compared to age

gg_8 <- dat_aug |> 
  ggplot(aes(
    x = response,
    y = age,
    fill = response)) +
  geom_boxplot() +
  geom_jitter(width = 0.15,
              alpha = 0.5,
              size = 1.6) +
  scale_fill_manual(
    values = c("RD" = "blue",
               "pCR" = "magenta")) +
  labs(
    x = "Response",
    y = "Age",
    title = "Distribution of age across outcome") +
  theme_minimal(base_size = 14) + 
  theme(plot.title = element_text(hjust = 0.5))

gg_8
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

Save plot

save_plot(gg_8, 
          "12_plot_8.png")
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

Load libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load data as variables

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

Age and race compared to aggressiveness and Tumor Size

Plot 1

gg_1 <- ggplot(
  dat_aug, 
  aes(x = age, 
      y = grade,
      color = race,
      size = tumor_size_before)) +
  geom_point(alpha = 0.7) +
  scale_size_continuous(name = "Tumor size") +
  labs(
    title = "Age vs Tumor Aggressiveness Across Tumor Size and Race",
    x = "Age",
    y = "Tumor Aggressiveness",
    color = "Race") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

gg_1
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_point()`).

Plot 2

gg_2 <- ggplot(
  dat_aug, 
  aes(
    x = factor(grade),
    y = age)) +
  geom_violin(fill = "steelblue", 
              alpha = 0.6,
              color = NA) +
  geom_jitter(
    aes(
      color = race,
      size = tumor_size_before),
    width = 0.15, 
    height = 0,
    alpha = 0.7) +
  labs(
    title = "Age vs Tumor Aggressiveness
    Across Tumor Size Before Treatment and Race",
    x = "Tumor Aggressiveness",
    y = "Age",
    color = "Race",
    size = "Tumore Size Before") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

gg_2
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Plot 3

gg_3 <- ggplot(
  dat_aug,
  aes(
    x = factor(grade), 
    y = age)) +
  geom_boxplot(outlier.shape = NA,
               alpha = 0.4) +
  geom_jitter(
    aes(
      color = race,
      size = tumor_size_before),
    width = 0.15,
    height = 0,
    alpha = 0.7) +
  labs(
    title = "Age vs Tumor Aggressiveness
    Across Tumor Size Before Treatment and Race",
    x = "Tumor Aggressiveness",
    y = "Age",
    color = "Race",
    size = "Tumore Size Before") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

gg_3
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Plot 4

gg_4 <- ggplot(
  dat_aug, 
  aes(
    age, 
    fill = race)) +
  geom_histogram(alpha = 0.5,
                 bins = 20, 
                 position = "identity") +
  facet_wrap(~ grade) +
  theme_minimal() +
  labs(
    title = "Age vs Tumor Aggressiveness
    Across Race, Devided by BR Grade",
    x = "Tumor Aggressiveness",
    y = "Age",
    fill = "Race") +
  theme(plot.title = element_text(hjust = 0.5))

gg_4
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).

Heatmap Age vs. Aggressiveness

Plot 1

df <- dat_aug |>
  mutate(
    age_bin = cut(
      age,
      breaks = seq(20, 
                   90, 
                   by = 5)))

gg_5 <- ggplot(
  df,
  aes(
    x = age,
    y = grade)) +
  stat_bin2d(bins = c(20, 3),
             aes(fill = ..count..)) +
  scale_fill_viridis_c(name = "Count") +
  labs(
    title = "Heatmap Age vs Tumor Aggressiveness",
    x = "Age",
    y = "Aggressiveness") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

gg_5
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_bin2d()`).

Plot 2

gg_6 <- df |>
  mutate(age_bin = cut(
    age,
    breaks = seq(20,
                 90,
                 5))) |>
  group_by(age_bin,
           grade) |>
  summarise(mean_size = mean(
    tumor_size_before,
    na.rm = TRUE)) |>
  ggplot(aes(
    age_bin,
    factor(grade),
    fill = mean_size)) +
  geom_tile(color = "white") +
  labs(
    title = "Heatmap of Mean Tumor Size by Age & Aggressiveness",
    x = "Age",
    y = "Tumor Aggressiveness",
    fill = "Mean size") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        plot.title = element_text(hjust = 0.5))
`summarise()` has grouped output by 'age_bin'. You can override using the
`.groups` argument.
gg_6

Plot 3

gg_7 <- ggplot(
  df, 
  aes(
    x = age, 
    y = grade)) +
  stat_density_2d_filled(contour_var = "ndensity") +
  labs(
    title = "2D Density Heatmap: Age vs Tumor Aggressiveness",
    x = "Age",
    y = "Tumor Aggressiveness") +
  theme_minimal(base_size = 14) + 
  theme(plot.title = element_text(hjust = 0.5))

gg_7
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_density2d_filled()`).

Save all plots

save_plot(gg_1, 
          "13_plot_9_1.png")
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_2, 
          "13_plot_9_2.png")
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_3, 
          "13_plot_9_3.png")
Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_4, 
          "13_plot_9_4.png")
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
save_plot(gg_5, 
          "13_plot_9_5.png")
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_bin2d()`).
save_plot(gg_6, 
          "13_plot_9_6.png")
save_plot(gg_7, 
          "13_plot_9_7.png")
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_density2d_filled()`).

Load Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load Data into variable

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

Tumorsize, Lymphnodes and Aggression Comparison

gg_1 <- ggplot(
  dat_aug, 
  aes(
    x = tumor_size_before, 
    y = nodes_before, 
    color = grade)) +
  geom_jitter(width = 0.2,
              height = 0.2,
              size = 3) +
  theme_minimal() +
  labs(
    title = "Tumor size vs. Lymphnodes and Aggression of Tumor",
    x = "Tumor Sizw",
    y = "Lymphnodes") +
  theme(plot.title = element_text(hjust = 0.5))

gg_1
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

gg_2 <- dat_aug |>
  count(tumor_size_before,
        nodes_before, 
        grade) |>
  ggplot(aes(
    x = tumor_size_before, 
    y = nodes_before, 
    fill = n)) +
  geom_tile(color = "white") +
  facet_wrap(~grade) +
  theme_minimal() +
  labs(
    title = "Amount of Tumor Sizes and Lymphnodes according to Aggression",
    x = "Tumorsize",
    y = "Lymphnodes",
    fill = "Amount") +
  theme(plot.title = element_text(hjust = 0.5))

gg_2
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_tile()`).

Save plots

save_plot(gg_1, 
          "14_plot_10_1.png")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_2, 
          "14_plot_10_2.png")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_tile()`).

Load Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load Data

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

Response to Treatment according to Receptor Status

gg_11 <- dat_aug |>
  count(receptor_group,
        response) |>
  group_by(receptor_group) |>
  mutate(
    percent = n / sum(n) * 100) |>
  ggplot(aes(
    x = receptor_group,
    y = percent,
    fill = response)) +
  geom_col(position = "stack") +
  theme_minimal() +
  labs(
    title = "Response to Treatment according to Receptor Status",
    x = "Receptors",
    y = "Percent",
    fill = "Response")+
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   vjust = 1.2),
        plot.title = element_text(hjust = 0.5))

gg_11

Save plot

save_plot(gg_11, 
          "15_plot_11.png")

Load Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load Data

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

Plot

gg_1 <- ggplot(
  dat_aug,
  aes(
    x = her2_status,
    y = her2_fish,
    fill = her2_status)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "HER2-FISH and HER2 receptor Status Comparison",
    x = "HER2 Receptor",
    y = "HER2-FISH") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

gg_1
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_boxplot()`).

gg_2 <- ggplot(
  dat_aug,
  aes(
    x = as.numeric(tumor_size_before), 
    y = her2_fish)) +
  geom_jitter(width = 0.2, 
              alpha = 0.5) +
  geom_smooth(method = "lm", 
              se = TRUE, 
              color = "blue") +
  theme_minimal() +
  labs(
    x = "Tumorsize",
    y = "HER2-FISH",
    title = "Linear Regression: HER2-FISH vs Tumorgsize") +
  theme(plot.title = element_text(hjust = 0.5))

gg_2
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 49 rows containing missing values or values outside the scale range
(`geom_point()`).

Save plots

save_plot(gg_1, 
          "16_plot_12_1.png")
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_boxplot()`).
save_plot(gg_2, 
          "16_plot_12_2.png")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 49 rows containing missing values or values outside the scale range
(`geom_point()`).

Load Libraries

library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))

Load Data

input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
  input_file,
  na = c("", "NA"),
  show_col_types = FALSE)

PCA

Set up the dataset

df_clean <- dat_aug |>
  select(
    her2_num, 
    er_num,
    pr_num, 
    age) |>
  drop_na()

df_scaled <- df_clean |>
  map_dfc(~ (. - mean(.)) / sd(.))

pca_res <- prcomp(df_scaled, 
                  center = FALSE, 
                  scale. = FALSE)

pca_df <- as_tibble(pca_res$x) |>
  bind_cols(df_clean)

Plot PCAs

gg_1 <- pca_df |>
  ggplot(aes(
    x = PC1,
    y = PC2,
    color = factor(er_num))) +
  geom_point(size = 3,
             alpha = 0.7) +
  labs(
    title = "PCA Plot of ER Status",
    color = "ER Status") +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))

gg_2 <- pca_df |>
  ggplot(aes(
    x = PC1,
    y = PC2,
    color = factor(pr_num))) +
  geom_point(size = 3, 
             alpha = 0.7) +
  labs(
    title = "PCA Plot of PR Status",
    color = "PR Status") +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))

gg_3 <- pca_df |>
  ggplot(aes(
    x = PC1,
    y = PC2, 
    color = factor(her2_num))) +
  geom_point(size = 3, 
             alpha = 0.7) +
  labs(
    title = "PCA Plot of HER2 Status",
    color = "HER2 Status") +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))

gg_1

gg_2

gg_3

Save plots

save_plot(gg_1, 
          "17_plot_13_1.png")
save_plot(gg_2, 
          "17_plot_13_2.png")
save_plot(gg_3, 
          "17_plot_13_3.png")

Running Code

Loading libraries:

library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))

Read data:

dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))

Re-ordering

Ordering the different genotypes into groups from the augmented data:

# Create ER/PR/HER2 combo groups
dataset <- dataset |>
  mutate(
    ER = if_else(
      er_status == "P", 
      "ER+",
      "ER-"),
    PR = if_else(
      pr_status == "P",
      "PR+", 
      "PR-"),
    HER2 = if_else(
      her2_status == "P",
      "HER2+",
      "HER2-"),
    combo = paste(
      ER,
      PR,
      HER2,
      sep = ", ")) |>
  mutate(
    combo = factor(
      combo,
      levels = c(
        "ER+, PR-, HER2-",
        "ER+, PR+, HER2-",
        "ER+, PR-, HER2+",
        "ER+, PR+, HER2+",
        "ER-, PR+, HER2-",
        "ER-, PR+, HER2+",
        "ER-, PR-, HER2+",
        "ER-, PR-, HER2-")))

# check: how many rows total?
nrow(dataset)
[1] 278
# check: how many patients in each genotype?
dataset |> 
  count(combo)
# A tibble: 8 × 2
  combo               n
  <fct>           <int>
1 ER+, PR-, HER2-    46
2 ER+, PR+, HER2-    94
3 ER+, PR-, HER2+     8
4 ER+, PR+, HER2+    16
5 ER-, PR+, HER2-     8
6 ER-, PR+, HER2+     3
7 ER-, PR-, HER2+    32
8 ER-, PR-, HER2-    71

Graph 1:

Response (pCR vs RD) by ER/PR/HER2 Gentypes

pCR = pathological complete response, meaning no residual invasive cancer found in the breast or lymph nodes at the time of surgery, It means the treatment eliminated all detectable invasive tumor. This is a strong positive prognostic indicator.

RD = residual disease, meaning invasive tumor remains in the breast and/or lymph nodes after treatment. It means the cancer did not completely respond to treatment. Indicates relative treatment resistance. RD is considered a non-responder outcome.

# total number of patients
N_total <- nrow(dataset)

# Count N per genotype (combo) group
group_counts_resp <- dataset |>
  group_by(combo) |>
  summarise(N_group = n(),
            .groups = "drop")

custom_palette <- c("#e44b8d",
                    "#90C84C")

gg_14 <- ggplot(
  dataset, 
  aes(
    x = combo,
    fill = response)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  
  # show all 8 levels even if some are empty
  scale_x_discrete(drop = FALSE) +
  
  # add N labels above each bar
  geom_text(
    data = group_counts_resp,
    aes(
      x = combo,
      y = 1.05,
      label = paste0("N=", N_group)),
    size = 3,
    inherit.aes = FALSE) +
  labs(
    title = "Response (pCR=good vs RD=bad) by ER/PR/HER2 Genotypes",
    x = "ER/PR/HER2 Genotypes",
    y = paste0("Proportion of Patients (N = ", N_total, ")"),
    fill = "Response") +
  scale_fill_manual(values = custom_palette) +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 45,
                                    hjust = 1,
                                    size = 8),
        plot.title   = element_text(hjust = 0.5),
        axis.title.x = element_text(vjust = 0)) +
  coord_cartesian(ylim = c(0, 1.1))

gg_14

Save plot
save_plot(gg_14,
          "18_key_plot_14.png")
Conclusions:

Estrogen receptor-positive breast cancer patients have a worse treatment response (Graph 1).

Running Code

Loading libraries:

library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))

Read data:

dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))

Re-ordering

Ordering the different genotypes into groups from the augmented data:

# Create ER/PR/HER2 combo groups
dataset <- dataset |>
  mutate(
    ER = if_else(
      er_status == "P",
      "ER+",
      "ER-"),
    PR = if_else(
      pr_status == "P", 
      "PR+", 
      "PR-"),
    HER2 = if_else(
      her2_status == "P",
      "HER2+",
      "HER2-"),
    combo = paste(
      ER, 
      PR,
      HER2,
      sep = ", ")) |>
  mutate(
    combo = factor(
      combo,
      levels = c(
        "ER+, PR-, HER2-",
        "ER+, PR+, HER2-",
        "ER+, PR-, HER2+",
        "ER+, PR+, HER2+",
        "ER-, PR+, HER2-",
        "ER-, PR+, HER2+",
        "ER-, PR-, HER2+",
        "ER-, PR-, HER2-")))

# check: how many rows total?
nrow(dataset)
[1] 278
# check: how many patients in each genotype?
dataset |> 
  count(combo)
# A tibble: 8 × 2
  combo               n
  <fct>           <int>
1 ER+, PR-, HER2-    46
2 ER+, PR+, HER2-    94
3 ER+, PR-, HER2+     8
4 ER+, PR+, HER2+    16
5 ER-, PR+, HER2-     8
6 ER-, PR+, HER2+     3
7 ER-, PR-, HER2+    32
8 ER-, PR-, HER2-    71

Graph 2:

Tumor Grade (1–3) by ER/PR/HER2 Genotypes
# Prepare data with grade as factor (1,2,3,NA)
dat_grade <- dataset |>
  mutate(
    grade = as.character(grade),
    grade = if_else(is.na(grade),
                    "NA",
                    grade),
    grade = factor(grade,
                   levels = c("1", "2", "3", "NA")))

# total N (same as before, but explicit)
N_total_grade <- nrow(dat_grade)

# N per genotype
group_counts_grade <- dat_grade |>
  group_by(combo) |>
  summarise(N_group = n(),
            .groups = "drop")

custom_palette <- c("#e44b8d",
                    "lightblue",
                    "#90C84C",
                    "gray")

gg_15 <- ggplot(
  dat_grade,
  aes(
    x = combo,
    fill = grade)) +
  geom_bar(position = "fill") +
  scale_x_discrete(drop = FALSE) +
  scale_y_continuous(labels = percent_format()) +
  
  # N labels above bars
  geom_text(
    data = group_counts_grade,
    aes(
      x = combo,
      y = 1.05,
      label = paste0("N=",
                     N_group)),
    size = 3,
    inherit.aes = FALSE) +
  labs(
    title = "Tumor Grade (1–3) by ER/PR/HER2 Genotypes",
    x = "ER/PR/HER2 Genotypes",
    y = paste0("Proportion of Patients (N = ", N_total_grade, ")"),
    fill = "Grade") +
  scale_fill_manual(values = custom_palette) +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 45,
                                    hjust = 1,
                                    size = 8),
        plot.title   = element_text(hjust = 0.5),
        axis.title.x = element_text(vjust = 0)) +
  coord_cartesian(ylim = c(0, 1.1))

gg_15

Save plot
save_plot(gg_15,
          "19_key_plot_15.png")
Conclusions:

Even tough being postitive for Estrogen receptors do have the worst outcome (Graph 1), they do not have a more agressive tumor grade (Graph 2). Being negative for both hormone receptors leads to a more aggressive tumor grade (Graph 2), and those two actually had the worst outcome (Graph 1).

Running Code

Loading libraries:

library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))

Read data:

dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))

Re-ordering

Ordering the different genotypes into groups from the augmented data:

# Create ER/PR/HER2 combo groups
dataset <- dataset |>
  mutate(
    ER = if_else(
      er_status == "P", 
      "ER+",
      "ER-"),
    PR = if_else(
      pr_status == "P",
      "PR+", 
      "PR-"),
    HER2 = if_else(
      her2_status == "P",
      "HER2+",
      "HER2-"),
    combo = paste(
      ER,
      PR,
      HER2,
      sep = ", ")) |>
  mutate(
    combo = factor(
      combo,
      levels = c(
        "ER+, PR-, HER2-",
        "ER+, PR+, HER2-",
        "ER+, PR-, HER2+",
        "ER+, PR+, HER2+",
        "ER-, PR+, HER2-",
        "ER-, PR+, HER2+",
        "ER-, PR-, HER2+",
        "ER-, PR-, HER2-")))

# check: how many rows total?
nrow(dataset)
[1] 278
# check: how many patients in each genotype?
dataset |> 
  count(combo)
# A tibble: 8 × 2
  combo               n
  <fct>           <int>
1 ER+, PR-, HER2-    46
2 ER+, PR+, HER2-    94
3 ER+, PR-, HER2+     8
4 ER+, PR+, HER2+    16
5 ER-, PR+, HER2-     8
6 ER-, PR+, HER2+     3
7 ER-, PR-, HER2+    32
8 ER-, PR-, HER2-    71

Graph 3:

Prepare for heatmap plot to look at the tumor size (0-4) among genotypes:
# 1. Make a clean tumor-size variable with NA as category
dataset_ts <- dataset |>
  mutate(
    ts = ifelse(
      is.na(tumor_size_before),
      "NA",
      as.character(tumor_size_before)))

# all genotype levels (from your combo factor) and all tumor-size levels
all_combos <- levels(dataset$combo)
ts_levels  <- c("0", "1", "2", "3", "4", "NA")    

# 2. Create full grid: every combo × every tumor-size level
full_grid_ts <- 
  expand.grid(
    combo = all_combos,
    ts = ts_levels,
    stringsAsFactors = FALSE)

# 3. Summary: counts & percentages per combo × tumor size
ts_summary <- 
  full_grid_ts |>
  left_join(
    dataset_ts |> 
      count(combo, ts),
    by = c("combo", "ts")) |>
  mutate(
    n = ifelse(
      is.na(n),
      0,
      n)) |>
  group_by(combo) |>
  mutate(
    total = sum(n),
    pct = ifelse(
      total > 0,
      n / total,
      0)) |>
  ungroup()

# factor for tumor size (plot rows)
ts_summary$ts <- factor(
  ts_summary$ts,
  levels = ts_levels)

# 4. N per combo (for labels) 
group_counts_ts <- 
  ts_summary |>
  distinct(combo,
           total) |>
  rename(N_group = total)

# 5. Extended factor for plotting: add an extra row "N" under 0 just for labels
ts_levels_with_N <- c(ts_levels, "N")  

ts_summary <- ts_summary |>
  mutate(
    ts_plot = factor(
      as.character(ts),
      levels = ts_levels_with_N))

n_label_df <- group_counts_ts |>
  mutate(
    ts_plot = factor(
      "N",
      levels = ts_levels_with_N))
Graph - Heatmap plot - Tumor Size Before Treatment by ER/PR/HER2 Genotypes
gg_16 <- ggplot() +
  # heatmap tiles
  geom_tile(
    data = ts_summary,
    aes(
      x = combo,
      y = ts_plot,
      fill = pct)) +

  # percentage labels inside tiles
  geom_text(
    data = ts_summary,
    aes(
      x = combo,
      y = ts_plot,
      label = paste0(round(pct * 100),
                     "%")),
    size = 3) +

  # N labels in their own row under the heatmap
  geom_text(
    data = n_label_df,
    aes(
      x = combo,
      y = ts_plot,
      label = paste0("N=", 
                     N_group)),
    size = 3) +
  
  scale_fill_gradient(
    low   = "#e5a0c6",
    high  = "#4F9A2F",
    labels = percent_format(accuracy = 1),
    name  = "Proportion") +

  # keep all genotypes, even if some have few patients
  scale_x_discrete(drop = FALSE) +

  # y labels: hide label for the extra "N" row with ""
  scale_y_discrete(
    breaks = ts_levels_with_N,
    labels = c("0", "1", "2", "3", "4", "NA", "")) +

  labs(
    title = "Tumor Size Before Treatment by ER/PR/HER2 Genotypes",
    x     = "ER/PR/HER2 Genotypes",
    y     = "Tumor size before surgery") +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 45, 
                                    hjust = 1, 
                                    size = 8),
        plot.title   = element_text(hjust = 0.5),
        axis.title.x = element_text(vjust = 0))

gg_16

Save plot
save_plot(gg_16,
          "20_key_plot_16.png")
Conclusion:

Far most of the genotype groups have a size 2 tumor stage, or otherwise they have developed into size 3-4. This is especially the case for groups that are Estrogen receptor positive, and the last two groups which are negative for both hormone receptors.

Running Code

Loading libraries:

library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))

Read data:

dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))

Re-ordering

Ordering the different genotypes into groups from the augmented data:

# Create ER/PR/HER2 combo groups
dataset <- dataset |>
  mutate(
    ER = if_else(
      er_status == "P", 
      "ER+",
      "ER-"),
    PR = if_else(
      pr_status == "P",
      "PR+", 
      "PR-"),
    HER2 = if_else(
      her2_status == "P",
      "HER2+",
      "HER2-"),
    combo = paste(
      ER,
      PR,
      HER2,
      sep = ", ")) |>
  mutate(
    combo = factor(
      combo,
      levels = c(
        "ER+, PR-, HER2-",
        "ER+, PR+, HER2-",
        "ER+, PR-, HER2+",
        "ER+, PR+, HER2+",
        "ER-, PR+, HER2-",
        "ER-, PR+, HER2+",
        "ER-, PR-, HER2+",
        "ER-, PR-, HER2-")))

# check: how many rows total?
nrow(dataset)
[1] 278
# check: how many patients in each genotype?
dataset |> 
  count(combo)
# A tibble: 8 × 2
  combo               n
  <fct>           <int>
1 ER+, PR-, HER2-    46
2 ER+, PR+, HER2-    94
3 ER+, PR-, HER2+     8
4 ER+, PR+, HER2+    16
5 ER-, PR+, HER2-     8
6 ER-, PR+, HER2+     3
7 ER-, PR-, HER2+    32
8 ER-, PR-, HER2-    71

Graph 4:

Prepare for a bubble plot to look at the lymph nodes among genotype groups:
# 1. Clean node categories (0–3 + NA)
dat_nodes <- dataset |>
  mutate(
    nodes_cat = if_else(
      is.na(nodes_before),
      "NA",
      as.character(nodes_before)))

# 2. Summarise counts and percentages per combo × node category
nodes_summary <- dat_nodes |>
  count(combo, 
        nodes_cat,
        name = "n") |>
  group_by(combo) |>
  mutate(
    total = sum(n),
    pct   = n / total) |>
  ungroup()

# 3. N per combo (for labels)
group_counts_nodes <- nodes_summary |>
  distinct(combo,
           total) |>
  rename(N_group = total)

# 4. Y-axis levels for plotting:
#    bottom → top: 3, 2, 1, 0, NA, N(row for labels)
y_levels <- c("0", "1", "2", "3", "NA", "N")

# Add plotting factor for nodes
nodes_summary <- nodes_summary |>
  mutate(
    nodes_y = factor(
      nodes_cat,
      levels = y_levels))

# Data frame for N labels row
n_label_df <- group_counts_nodes |>
  mutate(
    nodes_y = factor(
      "N", 
      levels = y_levels))

head(nodes_summary)
# A tibble: 6 × 6
  combo           nodes_cat     n total    pct nodes_y
  <fct>           <chr>     <int> <int>  <dbl> <fct>  
1 ER+, PR-, HER2- 0            10    46 0.217  0      
2 ER+, PR-, HER2- 1            25    46 0.543  1      
3 ER+, PR-, HER2- 2             3    46 0.0652 2      
4 ER+, PR-, HER2- 3             8    46 0.174  3      
5 ER+, PR+, HER2- 0            37    94 0.394  0      
6 ER+, PR+, HER2- 1            40    94 0.426  1      
group_counts_nodes
# A tibble: 8 × 2
  combo           N_group
  <fct>             <int>
1 ER+, PR-, HER2-      46
2 ER+, PR+, HER2-      94
3 ER+, PR-, HER2+       8
4 ER+, PR+, HER2+      16
5 ER-, PR+, HER2-       8
6 ER-, PR+, HER2+       3
7 ER-, PR-, HER2+      32
8 ER-, PR-, HER2-      71
Graph - Bubble plot - Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes
gg_17 <- ggplot() +
  # Bubbles: size & colour = percentage
  geom_point(
    data = nodes_summary,
    aes(
      x = combo,
      y = nodes_y,
      size = pct, 
      colour = pct),
    alpha = 0.9) +

  # N labels in their own row ABOVE NA
  geom_text(
    data = n_label_df,
    aes(
      x = combo,
      y = nodes_y,
      label = paste0("N=", N_group)),
    size = 3,) +

  # bubble size = percentage (hide legend)
  scale_size(
    range  = c(3, 10),
    labels = percent_format(accuracy = 1),
    guide  = "none") +

  # bubble colour = percentage (red gradient)
  scale_colour_gradient(
    low    = "#e2619f",
    high   = "#90C84C",
    labels = percent_format(accuracy = 1),
    name   = "Proportion") +

  # keep all 8 genotype groups
  scale_x_discrete(drop = FALSE) +

  # y labels: from TOP to BOTTOM = NA, 3, 2, 1, 0; "" hides N
  scale_y_discrete(
    limits = y_levels,
    labels = c("0", "1", "2", "3", "NA", "")) +

  labs(
    title = "Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes",
    x     = "ER/PR/HER2 Genotypes",
    y     = "Nodes before surgery") +
  coord_cartesian(ylim = c(0.2,
                           length(y_levels) + 0.7)) +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 45,
                                    hjust = 0.8,
                                    size = 8),
        plot.title   = element_text(hjust = 0.5),
        axis.title.x = element_text(vjust = 0))

gg_17

Save plot
save_plot(gg_17, 
          "21_key_plot_17.png")
Conclusion:

Generally it seems to be worst to have these three genotypes:

ER+, PR-, HER2-
ER-PR-, HER+2
ER-, PR-, HER2- (triple negative)

As they have more affected lymph nodes.

Running Code

Loading libraries:

library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))

Read data:

dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))

Re-ordering

Ordering the different genotypes into groups from the augmented data:

# Create ER/PR/HER2 combo groups
dataset <- dataset |>
  mutate(
    ER = if_else(
      er_status == "P", 
      "ER+",
      "ER-"),
    PR = if_else(
      pr_status == "P",
      "PR+", 
      "PR-"),
    HER2 = if_else(
      her2_status == "P",
      "HER2+",
      "HER2-"),
    combo = paste(
      ER,
      PR,
      HER2,
      sep = ", ")) |>
  mutate(
    combo = factor(
      combo,
      levels = c(
        "ER+, PR-, HER2-",
        "ER+, PR+, HER2-",
        "ER+, PR-, HER2+",
        "ER+, PR+, HER2+",
        "ER-, PR+, HER2-",
        "ER-, PR+, HER2+",
        "ER-, PR-, HER2+",
        "ER-, PR-, HER2-")))

# check: how many rows total?
nrow(dataset)
[1] 278
# check: how many patients in each genotype?
dataset |> 
  count(combo)
# A tibble: 8 × 2
  combo               n
  <fct>           <int>
1 ER+, PR-, HER2-    46
2 ER+, PR+, HER2-    94
3 ER+, PR-, HER2+     8
4 ER+, PR+, HER2+    16
5 ER-, PR+, HER2-     8
6 ER-, PR+, HER2+     3
7 ER-, PR-, HER2+    32
8 ER-, PR-, HER2-    71

Graph 5:

Prep: treatment among genotypes
# Clean treatment variable
dat_treat <- dataset |>
  mutate(
    treatment = as.character(treatment),
    treatment = if_else(
      is.na(treatment),
      "NA",
      treatment))

# Summarise: count + proportions per genotype × treatment
treat_summary <- dat_treat |>
  group_by(combo,
           treatment) |>
  summarise(n = n(),
            .groups = "drop") |>
  group_by(combo) |>
  mutate(
    total = sum(n),
    pct   = n / total) |>
  ungroup()

# N per combo (for labels)
group_counts_treat <- treat_summary |>
  distinct(combo,
           total) |>
  rename(N_group = total)

# Keep the same genotype order you already defined
treat_summary$combo <- factor(
  treat_summary$combo,
  levels = levels(dataset$combo))
Graph - Treatment Distribution Across ER/PR/HER2 Genotypes
# Construct color palett
treatment_colors <- c(
  "FAC"        = "#7f3667",
  "FACT"       = "#bb437e",
  "FACT+XRT/X" = "#e44b8d",
  "FEC"        = "#e2619f",
  "TFAC"       = "#e5a0c6",
  "FECT"       = "lightblue",
  "TFAC/HT"    = "#B3D84C",
  "TFEC"       = "#90C84C",
  "TH/FAC"     = "#70B83F",
  "TH/FEC"     = "#4F9A2F",
  "Tonly"      = "#1D6510",
  "TXFAC"      = "#004400",
  "NA"         = "grey")
gg_18 <- ggplot(
  treat_summary,
  aes(
    x = combo,
    y = pct,
    fill = treatment)) +
  
  # N labels above bars 
  geom_text(
    data = group_counts_treat,
    aes(
      x = combo,
      y = 1.07,                     
      label = paste0("N=",
                     N_group)),
    size = 3,
    inherit.aes = FALSE) +
  
  # Stacked bars
  geom_col(position = "fill") +
  
  # Percent labels
  geom_text(
    aes(
      label = ifelse(
        pct > 0.02,
        paste0(round(pct * 100), "%"),
        "")),
    position = position_fill(vjust = 0.5),
    size = 2.5) +
  
  # Apply grey for NA + colors for other treatments
  scale_fill_manual(values = treatment_colors) +

  scale_x_discrete(drop = FALSE) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Treatment Distribution Across ER/PR/HER2 Genotypes",
    x = "ER/PR/HER2 Genotype",
    y = "Proportion of Patients",
    fill = "Treatment") +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 20,
                                    hjust = 1, 
                                    size = 8),
        plot.title   = element_text(hjust = 0.5),
        axis.title.x = element_text(vjust = 0),
        
        legend.position = "bottom",
        legend.title = element_text(size = 10),
        legend.text  = element_text(size = 8),
        legend.box.spacing = unit(0.2, "lines"),
        legend.key.size  = unit(3.5, "mm")) +
  guides(fill = guide_legend(
    ncol = 7,
    byrow = TRUE,
    title.position = "top")) +
  coord_cartesian(ylim = c(0, 1.1))

gg_18

Save plot
save_plot(gg_18, 
          "22_key_plot_18.png")
Conclusion:

Generally, more different treatments were tried in the last two groups, especially for the second last group, which also had the best response. The second group from the left (ER+, PR+, HER2-) also had more treatments but showed a bad response. So in conclusion, (1) more personalized treatment options give better response, (2) and a general broad TFAC and TFEC is not working for estrogen-positive patients.

Aggressiveness, tumor size, and nodes do not necessarily mean a bad outcome. With personalized treatments, it can mean good results for the patient even if the stage or grade of the tumor seems to have a theoretically bad prognosis.

Load Libraries

library("tidyverse")
library("here")
source(here("R/99_proj_func.R"))

Load Data

load the data as a variable to use in descriptive graphs

augment_data <- 
  read_tsv(file = here("data/03_dat_aug.tsv"))

Age vs. Genotype

df_plot1 <- augment_data |>
  mutate(
    age_group = cut(
      age,
      breaks = seq(0, 100, by = 10),
      labels = paste(seq(0, 90, 10),
                     seq(9, 99, 10),
                     sep = "-"))) |>
  filter(!is.na(age_group)) |>
  count(age_group,
        receptor_group)

custom_palette <- c("#a53e76",
                    "#d24787",
                    "#e2619f",
                    "#e5a0c6",
                    "#B3D84C",
                    "#90C84C",
                    "#4F9A2F",
                    "#1D6510")

gg_19 <- ggplot(
  df_plot1,
  aes(
    x = age_group,
    y = n,
    fill = receptor_group)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of Age Groups and Genotypes",
    y = "Amount",
    x = "Age groups",
    fill = "Receptor groups") +
  scale_fill_manual(values = custom_palette) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

gg_19

Save plot

save_plot(gg_19, 
          "23_key_plot_19.png")

Table of Contents of Analysis files:

  • 01 - E and P receptor statuses vs cancer aggressiveness

  • 02 - HER2 receptor status vs cancer aggressiveness

  • 03 - HER2 receptor status vs tumor size before treatment

  • 04 - HER2 receptor status vs nodes before treatment

  • 05 - E and P receptor statuses vs treatment outcome

  • 06 - HER2 receptor status vs treatment outcome

  • 07 - Cancer aggressiveness vs treatment outcome

  • 08 - Age vs. Outcome

  • 09 - Age and Race vs. Aggressivness and Tumor Size

  • 10 - Tumorsize, Lymphnodes and Aggression Comparison

  • 11 - Response to Treatment according to Receptor Status

  • 12 - Her2 fish vs. Her2 receptor positive or negative and linear regression of her2 and tumor size

  • 13 - PCA with colouring according to receptor positiveness or negativeness

  • 14 - Response (pCR vs RD) by ER/PR/HER2 Genotypes

  • 15 - Tumor Grade (1–3) by ER/PR/HER2 Genotypes

  • 16 - Tumor Size Before Treatment by ER/PR/HER2 Genotypes

  • 17 - Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes

  • 18 - Treatment Distribution Across ER/PR/HER2 Genotypes

  • 19 - Age and Genotype Comparison